class BioDSL::DegapSeq

Remove gaps from sequences or gap only columns in alignments.

degap_seq remove gaps from sequences (the letters ~-_.). If the option columns_only is used then gaps from aligned sequences will be removed, if and only if the the entire columns consists of gaps.

Usage

degap_seq([columns_only: <bool>])

Options

Examples

Consider the following FASTA entries in the file ‘test.fna`:

>test1
A-G~T.C_
>test2
AGG_T-C~

To remove all gaps from all sequences do:

BD.new.read_fasta(input: "test.fna").degap_seq.dump.run

{:SEQ_NAME=>"test1", :SEQ=>"AGTC", :SEQ_LEN=>4}
{:SEQ_NAME=>"test2", :SEQ=>"AGGTC", :SEQ_LEN=>5}

To remove all gap-only columns use the columns_only option:

BD.new.
read_fasta(input: "test.fna").
degap_seq(columns_only: true).
dump.
run

{:SEQ_NAME=>"test1", :SEQ=>"A-GTC", :SEQ_LEN=>5}
{:SEQ_NAME=>"test2", :SEQ=>"AGGTC", :SEQ_LEN=>5}

Constants

STATS

Public Class Methods

new(options) click to toggle source

Constructor for DegapSeq.

@param options [Hash] Options Hash.

@option options [Boolean] :columns_only

Flag indicating that only gap-columns only shoule be removed.

@return [DegapSeq] Instance of DegapSeq.

# File lib/BioDSL/commands/degap_seq.rb, line 85
def initialize(options)
  @options = options
  @indels  = BioDSL::Seq::INDELS.sort.join('')
  @na_mask = nil
  @max_len = nil
  @count   = 0

  check_options
end

Public Instance Methods

lmb() click to toggle source

Return the command lambda for DegapSeq.

@return [Proc] Command lambda.

# File lib/BioDSL/commands/degap_seq.rb, line 98
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)

    if @options[:columns_only]
      degap_columns(input, output)
      status[:columns_removed] = @na_mask.count_false
    else
      degap_all(input, output)
    end
  end
end

Private Instance Methods

check_length(seq) click to toggle source

Check if sequence length match max_len.

@param seq [String] Sequences.

@raise [BioDSL::SeqError] if sequence length and max_len don’t match.

# File lib/BioDSL/commands/degap_seq.rb, line 175
def check_length(seq)
  return if @max_len == seq.length
  fail BioDSL::SeqError,
       "Uneven seq lengths: #{@max_len} != #{seq.length}"
end
check_options() click to toggle source

Check options.

# File lib/BioDSL/commands/degap_seq.rb, line 114
def check_options
  options_allowed(@options, :columns_only)
  options_allowed_values(@options, columns_only: [true, false, nil])
end
create_mask() click to toggle source

Create a mask for all-gap columns.

# File lib/BioDSL/commands/degap_seq.rb, line 182
def create_mask
  @na_mask = @na_mask.ne @count
end
degap_all(input, output) click to toggle source

Remove all gaps from all sequences in input stream and output to output stream.

@param input [Enumerator] Input stream. @param output [Enumerator::Yeilder] Output stream.

# File lib/BioDSL/commands/degap_seq.rb, line 222
def degap_all(input, output)
  input.each do |record|
    @status[:records_in] += 1

    degap_seq(record) if record.key? :SEQ

    output << record

    @status[:records_out] += 1
  end
end
degap_columns(input, output) click to toggle source

Remove all gap-only columns from all sequences in input stream and output to output stream.

@param input [Enumerator] Input stream. @param output [Enumerator::Yeilder] Output stream.

# File lib/BioDSL/commands/degap_seq.rb, line 124
def degap_columns(input, output)
  TmpDir.create('degap_seq') do |tmp_file, _|
    process_input(input, tmp_file)
    create_mask
    process_output(output, tmp_file)
  end
end
degap_seq(record) click to toggle source

Given a BioDSL record with sequence information, remove all gaps from the sequence.

@param record [Hash] BioDSL record.

# File lib/BioDSL/commands/degap_seq.rb, line 238
def degap_seq(record)
  entry = BioDSL::Seq.new_bp(record)

  @status[:sequences_in] += 1
  @status[:residues_in] += entry.length

  entry.seq.delete!(@indels)

  @status[:sequences_out] += 1
  @status[:residues_out] += entry.length

  record.merge! entry.to_bp
end
mask_add(seq) click to toggle source

Add sequence gaps to mask.

@param seq [String] Sequences.

# File lib/BioDSL/commands/degap_seq.rb, line 157
def mask_add(seq)
  @status[:sequences_in] += 1
  @status[:residues_in] += seq.length

  @max_len ||= seq.length

  check_length(seq)

  @na_mask ||= NArray.int(seq.length)
  na_seq = NArray.to_na(seq, 'byte')
  @indels.each_char { |c| @na_mask += na_seq.eq(c.ord) }
end
process_input(input, tmp_file) click to toggle source

Serialize all input record to a temporary file and at the same time add all sequence type records to the gap mask.

@param input [Enumerator] Input stream. @param tmp_file [String] Path to temporary file.

# File lib/BioDSL/commands/degap_seq.rb, line 137
def process_input(input, tmp_file)
  File.open(tmp_file, 'wb') do |ios|
    BioDSL::Serializer.new(ios) do |s|
      input.each do |record|
        @status[:records_in] += 1

        if (seq = record[:SEQ])
          mask_add(seq)
          @count += 1
        end

        s << record
      end
    end
  end
end
process_output(output, tmp_file) click to toggle source

Read all serialized records from the temporary file and emit to the output stream records with degapped sequences.

@param output [Enumerator::Yeilder] Output stream. @param tmp_file [String] Path to temporary file.

# File lib/BioDSL/commands/degap_seq.rb, line 191
def process_output(output, tmp_file)
  File.open(tmp_file, 'rb') do |ios|
    BioDSL::Serializer.new(ios) do |s|
      s.each do |record|
        remove_residues(record) if record[:SEQ]

        output << record
        @status[:records_out] += 1
      end
    end
  end
end
remove_residues(record) click to toggle source

Given a BioDSL record containing sequence information remove all residues based on the na_mask.

@param record [Hash] BioDSL record.

# File lib/BioDSL/commands/degap_seq.rb, line 208
def remove_residues(record)
  na_seq           = NArray.to_na(record[:SEQ], 'byte')
  record[:SEQ]     = na_seq[@na_mask].to_s
  record[:SEQ_LEN] = record[:SEQ].length

  @status[:sequences_out] += 1
  @status[:residues_out] += record[:SEQ].length
end